This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

library(ape)
library(TreeTools)
library(phangorn)
library(randomcoloR)
library(ggplot2)
library(dplyr)
library(uniqtag)
library(phytools)
library(ggstance)
library(ggtree)
library(grid)
library(gtable)
library(reshape2)
library(ggnewscale)
library(htmlwidgets)

Download global overview tab

raw.data<-read.csv("../../misc/data/GAGA_SPEC_TUBE_ECOL-GlobalOverview-210322.tsv",sep="\t",strip.white = T,stringsAsFactors = F,dec = ",",na.strings = c("","#DIV/0!"),skip = 2)

Load from google

# subset to entries with a Species name from the Assembly Overview
data<-subset(GlobalOverview,!is.na(Species..from.Assembly.Overview.) & !is.na(ENTRY.FROM.ASSEMBLY.OVERVIEW))

# rename Myrmoxenus to Temnothorax as the former is no proper genus
data$Species..from.Assembly.Overview.<-gsub("Myrmoxenus","Temnothorax",data$Species..from.Assembly.Overview.)

# Extract genus information 
data$genus<-gsub(" .*","",data$Species..from.Assembly.Overview.)

# Rename duplicate genera to Genus-sp1, Genus-sp2, etc.
data$taxID<-make_unique(data$genus,sep="-sp")

# show which genera are duplicated
table(gsub("-.*","",data$genus[duplicated(gsub(" .*","",data$Species..from.Assembly.Overview.))]))+1

   Acromyrmex Aphaenogaster          Atta    Camponotus      Carebara Crematogaster      Diacamma  Ectomomyrmex       Formica  Harpegnathos        Lasius 
            5             3             2             6             7             3             2             2             6             2             8 
   Leptogenys        Messor      Myrmecia       Myrmica  Odontomachus      Pheidole   Proceratium    Proformica   Rossomyrmex    Solenopsis      Tapinoma 
            3             2             3             3             3             5             2             2             2             2             2 
  Temnothorax   Tetramorium  Trachymyrmex 
            8             5             3 
# Downloaded ant tree from https://www.antwiki.org/wiki/Phylogeny_of_Formicidae
## The tree contains a few typos that I corrected manually (e.g. Paratrachina instead of Paratrechina, Myrmecosystus instead of Myrmecocystus)
treePaste<-paste(scan("../../misc/data/antTree.tre", what="character", sep=" "),collapse=" ")
Read 1 item
# expand the nodes for those species that we have multiple times
expansions<-table(data$genus[duplicated(data$genus)])+1
for (i in 1:length(expansions)){
  treePaste<-gsub(paste("(",names(expansions)[i],":)",sep=""),paste("(",paste(rep("\\1",expansions[i]),collapse=":0,",sep=""),":0):",sep=""),treePaste)
}

# load manipulated tree as tree object
raw.tree<-read.tree(text=treePaste)
raw.tree$tip.label<-make_unique(raw.tree$tip.label,sep="-sp")
# show labels for figuring out subfamilies
plot(raw.tree,show.tip.label = F)
92 branch length(s) NA(s): branch lengths ignored in the plot
nodelabels(frame =  "none")

# subset to extant ants, Martialis has to be node 1
tree<-Subtree(Preorder(raw.tree), 427)
grep("Martialis",tree$tip.label)
[1] 1
tree$tip.label<-gsub("some(.*[2-9])|some(.*)1","\\1\\2",tree$tip.label)
# plot with subfamilies

p<-ggtree(tree,size=.1)+xlim(0,40)+geom_tiplab(align=T,size=1)+ geom_text(aes(label=node),size=1,col="darkred")
Duplicated aesthetics after name standardisation: size
# Download species counts for all genera and other info from antwiki
#curl https://www.antwiki.org/wiki/images/a/ad/AntWiki_Valid_Genera.txt | LC_ALL=C sed 's/[^[:blank:][:print:]]//g'  > ../misc/data/AntWiki.clean.tsv
speciescount<-read.csv("../../misc/data/AntWiki.clean.tsv",sep="\t")
speciescount$Tribe[speciescount$Tribe==""]<-paste("tr. ",speciescount$Subfamily[speciescount$Tribe==""],sep="")
dataMerge<-merge(data,speciescount,by.x="genus",by.y="TaxonName",all.x=T,all.y=T)

dataMerge <- dataMerge %>%
  select(taxID, everything())

# give name to all unsequenced genera
dataMerge$taxID[is.na(dataMerge$taxID)]<-dataMerge$genus[is.na(dataMerge$taxID)]
# set species Count to NA for all multiple genera species
dataMerge$SpeciesCount[grep("-sp[2-9]",dataMerge$taxID,value = F)]<-NA

add data to tree

p<-ggtree(tree,size=.2,layout = "circular")+xlim(0,40)
#p$data$label<-make_unique(p$data$label,sep = "-sp")
#p$data$label[grep("NA.",p$data$label,ignore.case = F)]<-NA
dataMerge$DisplayName<-gsub(" \\(.*","",dataMerge$Species..from.Assembly.Overview.)
p2<-p %<+% dataMerge 

antGenera.tree <- p2 +
          geom_tiplab(align=T,size=1.2,offset=.5,linesize=0)+
          geom_tippoint(aes(size = SpeciesCount,fill=Tribe),pch=21,alpha=.5) + 
          guides(fill="none",color="none")+
          theme_tree()+
          theme(legend.position = c(.9,.85),legend.background = element_blank())
Duplicated aesthetics after name standardisation: sizeDuplicated aesthetics after name standardisation: size
antGenera.tree.w.GAGA <- p2 +
          geom_tiplab(aes(label=DisplayName,col=Tribe,alpha=ifelse(is.na(DisplayName),0,1)),align=T,size=1.8,offset = 8,linesize = 0)+
          geom_tiplab(align=T,size=1.2,offset=.5,linesize=0)+
          geom_tippoint(aes(size = SpeciesCount,fill=Tribe),pch=21,alpha=.5) + 
          guides(fill="none",color="none",alpha="none")+
          theme_tree()+
          theme(legend.position = c(.9,.85),legend.background = element_blank())
Duplicated aesthetics after name standardisation: sizeDuplicated aesthetics after name standardisation: sizeDuplicated aesthetics after name standardisation: sizeDuplicated aesthetics after name standardisation: size
antGenera.tree
ggsave("../../misc/plots/antTree.pdf",height=10)
Saving 7.29 x 10 in image

antGenera.tree.w.GAGA
ggsave("../../misc/plots/antTree.GAGA.pdf",height=10)
Saving 7.29 x 10 in image

# How many GAGA species are shown?
sum(!is.na(dataMerge$DisplayName))
[1] 163
GAGA.tree<-keep.tip(tree,antGenera.tree.w.GAGA$data$node[!is.na(antGenera.tree.w.GAGA$data$DisplayName)])

GAGA.tree.plot<-ggtree(GAGA.tree,size=.2) %<+% dataMerge 
GAGA.tree.plot2<-GAGA.tree.plot+geom_tiplab(aes(label=DisplayName,col=ifelse(grepl("GAGA",ENTRY.FROM.ASSEMBLY.OVERVIEW),"GAGA","other")),size=1.5,show.legend=F)+scale_color_manual(name="",values = c("black","grey60"))
GAGA.tree.plot2

GAGA.tree.plot3

# extract data from tree for facet plotting

facetData<-as.data.frame(GAGA.tree.plot2$data[GAGA.tree.plot2$data$isTip==T,])
facetData$id<-facetData$label
facetData$Genome.size<-as.numeric(facetData$Genome.size)
facetData$Scaffold.N50<-as.numeric(facetData$Scaffold.N50)
facetData$Contig.N50<-as.numeric(facetData$Contig.N50)
facetData$Number.of.scaffolds<-as.numeric(facetData$Number.of.scaffolds)

# extract BUSCO scores
facetData$BUSCO.hym.C<-as.numeric(gsub("C:(.*?)%.*","\\1",facetData$BUSCO.Hymenoptera))
facetData$BUSCO.hym.F<-as.numeric(gsub(".*F:(.*?)%.*","\\1",facetData$BUSCO.Hymenoptera))
facetData$BUSCO.hym.M<-as.numeric(gsub(".*M:(.*?)%.*","\\1",facetData$BUSCO.Hymenoptera))

facetData$BUSCO.euk.C<-as.numeric(gsub("C:(.*?)%.*","\\1",facetData$BUSCO.Eukaryota))
facetData$BUSCO.euk.F<-as.numeric(gsub(".*F:(.*?)%.*","\\1",facetData$BUSCO.Eukaryota))
facetData$BUSCO.euk.M<-as.numeric(gsub(".*M:(.*?)%.*","\\1",facetData$BUSCO.Eukaryota))


dd <- data.frame(id=facetData$label, GS=round(facetData$Genome.size/1000000),N50=facetData$Scaffold.N50,nscf=facetData$Number.of.scaffolds,B.hym.C=facetData$BUSCO.hym.C,B.hym.F=facetData$BUSCO.hym.F,B.hym.M=facetData$BUSCO.hym.M,B.euk.C=facetData$BUSCO.euk.C,B.euk.F=facetData$BUSCO.euk.F,B.euk.M=facetData$BUSCO.euk.M,tribe=facetData$Tribe,PacBio=facetData$PacBio.1,stLFR=facetData$stLFR.2,HiC=facetData$Hi.C.1,RNAseq=facetData$RNA..additional.samples.still.under.seq..,Finalized=facetData$Current.state,wgs=facetData$Short.reads..WGS..1,Gid=facetData$GAGA.ID,status=facetData$Current.state)
dd$tribeColor <- cladeColor[match(dd$tribe, names(cladeColor))]

dd$PacBio[!is.na(dd$PacBio)]<-1
dd$PacBio[is.na(dd$PacBio)]<-0
dd$stLFR[!is.na(dd$stLFR)]<-1
dd$stLFR[is.na(dd$stLFR)]<-0
dd$HiC[!is.na(dd$HiC)]<-1
dd$HiC[is.na(dd$HiC)]<-0
dd$wgs[!is.na(dd$wgs)]<-1
dd$wgs[is.na(dd$wgs)]<-0

rownames(dd)<-dd$id

dd$RNAseq[!is.na(dd$RNAseq)]<-3
dd$RNAseq[is.na(dd$RNAseq)]<-0

dd$Finalized[grep("Final",dd$Finalized)]<-2
dd$Finalized[dd$Finalized!=2]<-0

dd$DisplayName<-facetData$DisplayName

dd$node<-facetData$node
dd$tech<-as.factor(paste(dd$PacBio,dd$HiC,dd$stLFR,sep=""))
levels(dd$tech)[levels(dd$tech)=="000"]<-"non-GAGA"
levels(dd$tech)[levels(dd$tech)=="100"]<-"PacBio"
levels(dd$tech)[levels(dd$tech)=="001"]<-"stLFR"
levels(dd$tech)[levels(dd$tech)=="101"]<-"P/s"
levels(dd$tech)[levels(dd$tech)=="110"]<-"P/H"
levels(dd$tech)[levels(dd$tech)=="111"]<-"P/H/s"


ddHeatmap<-dd[c("id","tribeColor","tribe","PacBio","stLFR","HiC","wgs","Finalized","RNAseq")]

#GAGA.tree.plot4<-facet_plot(GAGA.tree.plot3, 'Genome Size', data = dd, geom=geom_barh, mapping = aes(x = GS,fill=tribeColor), 
 #               stat='identity' )+ theme_tree2() + scale_fill_identity()
GAGA.tree.plot.heatmap<-gheatmap(GAGA.tree.plot3+ylim(0,165), ddHeatmap[,c(4:9)], offset=7, colnames=T, legend_title="Tech",  width = .25,colnames_position = "top", colnames_offset_y = 1,colnames_angle=65,hjust=0,font.size = 2.8)+
                  scale_fill_manual(values = c("grey90","lightgreen","darkred","lightblue"))+
                  theme(legend.position = "none")
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
GAGA.tree.plot.heatmap
ggsave("../../misc/plots/GAGA.tree.technique.pdf",GAGA.tree.plot.heatmap,width=6,height=10)

ddHeatmap2<-dd[c("id","tribeColor","tribe","B.hym.C","B.euk.C")]
colnames(ddHeatmap2)[4:5]<-c("Hymenopt.","Eukaryota")
row.names(ddHeatmap2)<-ddHeatmap2$id
heatcol <- c("steelblue","cyan", "red")
GAGA.tree.plot.busco<-gheatmap(GAGA.tree.plot3+ylim(0,165), ddHeatmap2[,c(4:5)], offset=7, colnames=T, legend_title="Tech",  width = .215,colnames_position = "top", colnames_offset_y = 1,colnames_angle=65,hjust=0,font.size = 2.5)+scale_fill_gradientn(colours = rev(heatcol),na.value = "grey90",name="% comp. Busco")+theme(legend.position = c(.2,.8),legend.background = element_blank())
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
Scale for 'fill' is already present. Adding another scale for 'fill', which will replace the existing scale.
GAGA.tree.plot.busco
ggsave("../../misc/plots/GAGA.tree.busco.pdf",GAGA.tree.plot.busco,width=6,height=10)


GAGA.tree.plot.N50<-facet_plot(GAGA.tree.plot.heatmap+new_scale("color"), 'scfN50 (Mb)', data = dd, geom=geom_point, mapping = aes(x = N50/1000000,color=tech),size=2)+ 
  theme_tree2()+
  scale_color_brewer(palette = "Set2" ,guide = guide_legend(nrow=2,label.position = "top"))+
  theme(legend.position = c(.9,.9),
        legend.background = element_rect(color="black",fill=rgb(0,0,0,0)),
        legend.margin = margin(0,2,2,2,"mm"),
        legend.spacing = unit(0,"mm"),
        legend.title = element_blank(),
        legend.key.height = unit(0,"mm"),
        legend.key.width = unit(0,"mm"),
        legend.box.spacing = unit(0,"mm"))+
        guides(fill="none")+
  ylim(0,170)
Scale for 'y' is already present. Adding another scale for 'y', which will replace the existing scale.
GAGA.tree.plot.N50

# change rel_width of panels
## see https://groups.google.com/g/bioc-ggtree/c/tZ0qkluBeGU/m/nkfWC9ixCwAJ

gt = ggplot_gtable(ggplot_build(GAGA.tree.plot.N50))
Removed 26 rows containing missing values (geom_point).
#gtable_show_layout(gt) # will show you the layout - very handy function
#gt # see plot layout in table format
gt$layout$l[grep('panel-1', gt$layout$name)] # you want to find the column specific to panel-2
[1] 5 7
gt$widths[5] = 1.9*gt$widths[5] # in this case it was colmun 7 - reduce the width by a half
#grid.draw(gt) # plot with grid draw
grid.draw(gt)
ggsave("../../misc/plots/GAGA.tree.n50.pdf",grid.draw(gt),width=10,height=9)
ggsave("../../misc/plots/GAGA.assemblyQuality.pdf",assembly.quality.plot2,width=6,height=6)
ggsave("../../misc/plots/GAGA.assemblyQuality.boxplots.pdf",GAGA.quality.boxplots,width=6,height=6)
dd$Subfamily<-facetData$Subfamily.y
dd$DisplayNameU<-make_unique(dd$DisplayName,sep="")
dd$DisplayNameU<- factor(dd$DisplayNameU, levels = dd$DisplayNameU[rev(order(dd$Subfamily,dd$GS))])
---
title: "R Notebook"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
library(ape)
library(TreeTools)
library(phangorn)
library(randomcoloR)
library(ggplot2)
library(dplyr)
library(uniqtag)
library(phytools)
library(ggstance)
library(ggtree)
library(grid)
library(gtable)
library(reshape2)
library(ggnewscale)
library(htmlwidgets)
```


Download global overview tab
```{r}
#raw.data<-read.csv("../../misc/data/GAGA_SPEC_TUBE_ECOL-GlobalOverview-210322.tsv",sep="\t",strip.white = T,stringsAsFactors = F,dec = ",",na.strings = c("","#DIV/0!"),skip = 2)

```

Load from google
```{R message=FALSE, warning=FALSE, include=FALSE}
#id<-"1e7Hj_BQ8rke5XW7j93YL6QVAGbcvsb2yw7oD86WnPJY"
#gsheet<-read_sheet(id,sheet="GlobalOverview",skip = 2,na = c(""," ","NA","#DIV/0!"),col_types = "c",trim_ws=T)
#GlobalOverview<-as.data.frame(gsheet,stringsAsFactors=F)


#colnames(GlobalOverview)<-c("RNAseqID","RNA.Comment","RNA.Worker","RNA.Large worker","RNA.Smallworker","RNA.Soldier","RNA.Gyne","RNA.Queen","RNA.Male","RNA.Larvae","RNA.Pupae","RNA.Brood","RNA.Mixed","ENTRY.FROM.Species.ID.polished","original.species.ID","final.species.ID","ENTRY.FROM.SampleType","PacBIo","stLFR","HiC","Collector.Firstname","Collector.Name","Sample.ID","Species","PacBio.library","stLFR.library","Short.reads..WGS..library","RNAseq.library","PacBio","stLFR.1","Short.reads..WGS.","Hi.C","RNAseq","Subfamily","Country","Comment","ENTRY.FROM.GAGA.SUBMISSION","GAGA.ID","ENTRY.FROM.ASSEMBLY.OVERVIEW","Species..from.Assembly.Overview.","PacBio.1","stLFR.2","Short.reads..WGS..1","Hi.C.1","RNA..additional.samples.still.under.seq..","PacBio.assembly","PacBio.stLFR.WGS.polished.assembly","PacBio.stLFR.scaffolded","PacBio.Hi.C.scaffolded","stLFR.assembly","Current.state","Genome.size","Number.of.scaffolds","Scaffold.N50","Contig.N50","Longest.scaffold","BUSCO.Eukaryota","BUSCO.Hymenoptera","Duplicated.scaffolds","Total.length.of.duplicated.scaffolds","Percentage.of.duplication","Comments","Bacterial.scaffolds.count","Total.length.of.bacterial.scaffolds","Bacterial.scaffold.prevalence..","Top.bacterial.taxon","Putative.misassembly.count","Putative.misassemblies...100Kb..count","Genome.size.1","Number.of.scaffolds.1","Scaffold.N50.1","Contig.N50.1","Longest.scaffold.1","BUSCO.Eukaryota.1","BUSCO.Hymenoptera.1","Repeat.annotation","Gene.annotation")

#save(file = "../../misc/data/globaloverview.Robject",GlobalOverview)
load(file = "../../misc/data/globaloverview.Robject")
```

```{r}
# subset to entries with a Species name from the Assembly Overview
data<-subset(GlobalOverview,!is.na(Species..from.Assembly.Overview.) & !is.na(ENTRY.FROM.ASSEMBLY.OVERVIEW))

# rename Myrmoxenus to Temnothorax as the former is no proper genus
data$Species..from.Assembly.Overview.<-gsub("Myrmoxenus","Temnothorax",data$Species..from.Assembly.Overview.)

# Extract genus information 
data$genus<-gsub(" .*","",data$Species..from.Assembly.Overview.)

# Rename duplicate genera to Genus-sp1, Genus-sp2, etc.
data$taxID<-make_unique(data$genus,sep="-sp")

# show which genera are duplicated
table(gsub("-.*","",data$genus[duplicated(gsub(" .*","",data$Species..from.Assembly.Overview.))]))+1

```


```{r}
# Downloaded ant tree from https://www.antwiki.org/wiki/Phylogeny_of_Formicidae
## The tree contains a few typos that I corrected manually (e.g. Paratrachina instead of Paratrechina, Myrmecosystus instead of Myrmecocystus)
treePaste<-paste(scan("../../misc/data/antTree.tre", what="character", sep=" "),collapse=" ")

# expand the nodes for those species that we have multiple times
expansions<-table(data$genus[duplicated(data$genus)])+1
for (i in 1:length(expansions)){
  treePaste<-gsub(paste("(",names(expansions)[i],":)",sep=""),paste("(",paste(rep("\\1",expansions[i]),collapse=":0,",sep=""),":0):",sep=""),treePaste)
}

# load manipulated tree as tree object
raw.tree<-read.tree(text=treePaste)
raw.tree$tip.label<-make_unique(raw.tree$tip.label,sep="-sp")
```


```{r}
# show labels for figuring out subfamilies
plot(raw.tree,show.tip.label = F)
nodelabels(frame =  "none")
```


```{r}
# subset to extant ants, Martialis has to be node 1
tree<-Subtree(Preorder(raw.tree), 427)
grep("Martialis",tree$tip.label)
tree$tip.label<-gsub("some(.*[2-9])|some(.*)1","\\1\\2",tree$tip.label)
```

```{r}
# plot with subfamilies

p<-ggtree(tree,size=.1)+xlim(0,40)+geom_tiplab(align=T,size=1)+ geom_text(aes(label=node),size=1,col="darkred")
```




```{bash}
# Download species counts for all genera and other info from antwiki
#curl https://www.antwiki.org/wiki/images/a/ad/AntWiki_Valid_Genera.txt | LC_ALL=C sed 's/[^[:blank:][:print:]]//g'  > ../misc/data/AntWiki.clean.tsv
```
```{r}
speciescount<-read.csv("../../misc/data/AntWiki.clean.tsv",sep="\t")
speciescount$Tribe[speciescount$Tribe==""]<-paste("tr. ",speciescount$Subfamily[speciescount$Tribe==""],sep="")
```
```{r}
dataMerge<-merge(data,speciescount,by.x="genus",by.y="TaxonName",all.x=T,all.y=T)

dataMerge <- dataMerge %>%
  select(taxID, everything())

# give name to all unsequenced genera
dataMerge$taxID[is.na(dataMerge$taxID)]<-dataMerge$genus[is.na(dataMerge$taxID)]
# set species Count to NA for all multiple genera species
dataMerge$SpeciesCount[grep("-sp[2-9]",dataMerge$taxID,value = F)]<-NA
```


 add data to tree
```{r}
p<-ggtree(tree,size=.2,layout = "circular")+xlim(0,40)
#p$data$label<-make_unique(p$data$label,sep = "-sp")
#p$data$label[grep("NA.",p$data$label,ignore.case = F)]<-NA
dataMerge$DisplayName<-gsub(" \\(.*","",dataMerge$Species..from.Assembly.Overview.)
p2<-p %<+% dataMerge 
```



```{r}

antGenera.tree <- p2 +
          geom_tiplab(align=T,size=1.2,offset=.5,linesize=0)+
          geom_tippoint(aes(size = SpeciesCount,fill=Tribe),pch=21,alpha=.5) + 
          guides(fill="none",color="none")+
          theme_tree()+
          theme(legend.position = c(.9,.85),legend.background = element_blank())

antGenera.tree.w.GAGA <- p2 +
          geom_tiplab(aes(label=DisplayName,col=Tribe,alpha=ifelse(is.na(DisplayName),0,1)),align=T,size=1.8,offset = 8,linesize = 0)+
          geom_tiplab(align=T,size=1.2,offset=.5,linesize=0)+
          geom_tippoint(aes(size = SpeciesCount,fill=Tribe),pch=21,alpha=.5) + 
          guides(fill="none",color="none",alpha="none")+
          theme_tree()+
          theme(legend.position = c(.9,.85),legend.background = element_blank())

```

```{r}
antGenera.tree
ggsave("../../misc/plots/antTree.pdf",height=10)

antGenera.tree.w.GAGA
ggsave("../../misc/plots/antTree.GAGA.pdf",height=10)
# How many GAGA species are shown?
sum(!is.na(dataMerge$DisplayName))
```



```{r}
GAGA.tree<-keep.tip(tree,antGenera.tree.w.GAGA$data$node[!is.na(antGenera.tree.w.GAGA$data$DisplayName)])

GAGA.tree.plot<-ggtree(GAGA.tree,size=.2) %<+% dataMerge 
GAGA.tree.plot2<-GAGA.tree.plot+geom_tiplab(aes(label=DisplayName,col=ifelse(grepl("GAGA",ENTRY.FROM.ASSEMBLY.OVERVIEW),"GAGA","other")),size=1.5,show.legend=F)+scale_color_manual(name="",values = c("black","grey60"))
```

```{r}
GAGA.tree.plot2
```


```{r eval=FALSE, include=FALSE}

# Add tribe info as clade highlights

GAGA.tree.plot2$data$Tribe<-as.factor(GAGA.tree.plot2$data$Tribe)
TribeNodes<-vector()

tribes<-levels(GAGA.tree.plot2$data$Tribe)
tribeCount<-table(GAGA.tree.plot2$data$Tribe)

# loop over all tribes and extract the MRCA node of all genera for each tribe
for (i in 1:length(tribes)){
  nodes<-unlist(subset(GAGA.tree.plot2$data,Tribe==tribes[i],node,include.self=T))
  if (length(nodes)==1){
    TribeNodes[i]<-nodes
  }else{
    TribeNodes[i]<-mrca.phylo(GAGA.tree,nodes)
  }
}

#define clade colors
set.seed(1)
cladeColor<-distinctColorPalette(length(TribeNodes))
 
names(TribeNodes)<-tribes
names(cladeColor)<-tribes
tribecolors<-data.frame(tribe=names(cladeColor),color=cladeColor)
#write.table(tribecolors,file="../../misc/data/tribeColors.tsv",sep="\t",quote=F, row.names=F)
GAGA.tree.plot3<-GAGA.tree.plot2

# loop over each tribe and add a clade highlight. Label the highlight only for larger clades
for (tribe in tribes){
  if (tribeCount[tribe]>4){
    GAGA.tree.plot3<-GAGA.tree.plot3+geom_cladelabel(node=TribeNodes[tribe],label= tribe,align = T, offset = 5, col=c(cladeColor[tribe],"black"),barsize = 1,angle = 90,fontsize = 2,offset.text = .5,hjust = .5)+xlim_tree(25)
  }else{
    GAGA.tree.plot3<-GAGA.tree.plot3+geom_cladelabel(node=TribeNodes[tribe],label= "",align = T, offset = 5, col=cladeColor[tribe],barsize = 1)+xlim_tree(30)
  }
}

```

```{r}
GAGA.tree.plot3
```

```{r}
# extract data from tree for facet plotting

facetData<-as.data.frame(GAGA.tree.plot2$data[GAGA.tree.plot2$data$isTip==T,])
facetData$id<-facetData$label
facetData$Genome.size<-as.numeric(facetData$Genome.size)
facetData$Scaffold.N50<-as.numeric(facetData$Scaffold.N50)
facetData$Contig.N50<-as.numeric(facetData$Contig.N50)
facetData$Number.of.scaffolds<-as.numeric(facetData$Number.of.scaffolds)

# extract BUSCO scores
facetData$BUSCO.hym.C<-as.numeric(gsub("C:(.*?)%.*","\\1",facetData$BUSCO.Hymenoptera))
facetData$BUSCO.hym.F<-as.numeric(gsub(".*F:(.*?)%.*","\\1",facetData$BUSCO.Hymenoptera))
facetData$BUSCO.hym.M<-as.numeric(gsub(".*M:(.*?)%.*","\\1",facetData$BUSCO.Hymenoptera))

facetData$BUSCO.euk.C<-as.numeric(gsub("C:(.*?)%.*","\\1",facetData$BUSCO.Eukaryota))
facetData$BUSCO.euk.F<-as.numeric(gsub(".*F:(.*?)%.*","\\1",facetData$BUSCO.Eukaryota))
facetData$BUSCO.euk.M<-as.numeric(gsub(".*M:(.*?)%.*","\\1",facetData$BUSCO.Eukaryota))


dd <- data.frame(id=facetData$label, GS=round(facetData$Genome.size/1000000),N50=facetData$Scaffold.N50,nscf=facetData$Number.of.scaffolds,B.hym.C=facetData$BUSCO.hym.C,B.hym.F=facetData$BUSCO.hym.F,B.hym.M=facetData$BUSCO.hym.M,B.euk.C=facetData$BUSCO.euk.C,B.euk.F=facetData$BUSCO.euk.F,B.euk.M=facetData$BUSCO.euk.M,tribe=facetData$Tribe,PacBio=facetData$PacBio.1,stLFR=facetData$stLFR.2,HiC=facetData$Hi.C.1,RNAseq=facetData$RNA..additional.samples.still.under.seq..,Finalized=facetData$Current.state,wgs=facetData$Short.reads..WGS..1,Gid=facetData$GAGA.ID,status=facetData$Current.state)
dd$tribeColor <- cladeColor[match(dd$tribe, names(cladeColor))]

dd$PacBio[!is.na(dd$PacBio)]<-1
dd$PacBio[is.na(dd$PacBio)]<-0
dd$stLFR[!is.na(dd$stLFR)]<-1
dd$stLFR[is.na(dd$stLFR)]<-0
dd$HiC[!is.na(dd$HiC)]<-1
dd$HiC[is.na(dd$HiC)]<-0
dd$wgs[!is.na(dd$wgs)]<-1
dd$wgs[is.na(dd$wgs)]<-0

rownames(dd)<-dd$id

dd$RNAseq[!is.na(dd$RNAseq)]<-3
dd$RNAseq[is.na(dd$RNAseq)]<-0

dd$Finalized[grep("Final",dd$Finalized)]<-2
dd$Finalized[dd$Finalized!=2]<-0

dd$DisplayName<-facetData$DisplayName

dd$node<-facetData$node
dd$tech<-as.factor(paste(dd$PacBio,dd$HiC,dd$stLFR,sep=""))
levels(dd$tech)[levels(dd$tech)=="000"]<-"non-GAGA"
levels(dd$tech)[levels(dd$tech)=="100"]<-"PacBio"
levels(dd$tech)[levels(dd$tech)=="001"]<-"stLFR"
levels(dd$tech)[levels(dd$tech)=="101"]<-"P/s"
levels(dd$tech)[levels(dd$tech)=="110"]<-"P/H"
levels(dd$tech)[levels(dd$tech)=="111"]<-"P/H/s"


ddHeatmap<-dd[c("id","tribeColor","tribe","PacBio","stLFR","HiC","wgs","Finalized","RNAseq")]

#GAGA.tree.plot4<-facet_plot(GAGA.tree.plot3, 'Genome Size', data = dd, geom=geom_barh, mapping = aes(x = GS,fill=tribeColor), 
 #               stat='identity' )+ theme_tree2() + scale_fill_identity()
```


```{r}
GAGA.tree.plot.heatmap<-gheatmap(GAGA.tree.plot3+ylim(0,165), ddHeatmap[,c(4:9)], offset=7, colnames=T, legend_title="Tech",  width = .25,colnames_position = "top", colnames_offset_y = 1,colnames_angle=65,hjust=0,font.size = 2.8)+
                  scale_fill_manual(values = c("grey90","lightgreen","darkred","lightblue"))+
                  theme(legend.position = "none")


```
```{r}
GAGA.tree.plot.heatmap
ggsave("../../misc/plots/GAGA.tree.technique.pdf",GAGA.tree.plot.heatmap,width=6,height=10)
```

```{r}
ddHeatmap2<-dd[c("id","tribeColor","tribe","B.hym.C","B.euk.C")]
colnames(ddHeatmap2)[4:5]<-c("Hymenopt.","Eukaryota")
row.names(ddHeatmap2)<-ddHeatmap2$id
```

```{r}
heatcol <- c("steelblue","cyan", "red")
GAGA.tree.plot.busco<-gheatmap(GAGA.tree.plot3+ylim(0,165), ddHeatmap2[,c(4:5)], offset=7, colnames=T, legend_title="Tech",  width = .215,colnames_position = "top", colnames_offset_y = 1,colnames_angle=65,hjust=0,font.size = 2.5)+scale_fill_gradientn(colours = rev(heatcol),na.value = "grey90",name="% comp. Busco")+theme(legend.position = c(.2,.8),legend.background = element_blank())
```
```{r}
GAGA.tree.plot.busco
ggsave("../../misc/plots/GAGA.tree.busco.pdf",GAGA.tree.plot.busco,width=6,height=10)
```


```{r}

GAGA.tree.plot.N50<-facet_plot(GAGA.tree.plot.heatmap+new_scale("color"), 'scfN50 (Mb)', data = dd, geom=geom_point, mapping = aes(x = N50/1000000,color=tech),size=2)+ 
  theme_tree2()+
  scale_color_brewer(palette = "Set2" ,guide = guide_legend(nrow=2,label.position = "top"))+
  theme(legend.position = c(.9,.9),
        legend.background = element_rect(color="black",fill=rgb(0,0,0,0)),
        legend.margin = margin(0,2,2,2,"mm"),
        legend.spacing = unit(0,"mm"),
        legend.title = element_blank(),
        legend.key.height = unit(0,"mm"),
        legend.key.width = unit(0,"mm"),
        legend.box.spacing = unit(0,"mm"))+
        guides(fill="none")+
  ylim(0,170)

```
```{r}
GAGA.tree.plot.N50
```


```{r}
# change rel_width of panels
## see https://groups.google.com/g/bioc-ggtree/c/tZ0qkluBeGU/m/nkfWC9ixCwAJ

gt = ggplot_gtable(ggplot_build(GAGA.tree.plot.N50))
#gtable_show_layout(gt) # will show you the layout - very handy function
#gt # see plot layout in table format
gt$layout$l[grep('panel-1', gt$layout$name)] # you want to find the column specific to panel-2
gt$widths[5] = 1.9*gt$widths[5] # in this case it was colmun 7 - reduce the width by a half
#grid.draw(gt) # plot with grid draw
```


```{r}
grid.draw(gt)
ggsave("../../misc/plots/GAGA.tree.n50.pdf",grid.draw(gt),width=10,height=9)

```




```{R message=FALSE, warning=FALSE, include=FALSE}
dd$RNAseq<-as.factor(dd$RNAseq)
dd$N50mb<-round(dd$N50/1000000,2)
levels(dd$RNAseq)<-c("N","Y")
```


```{R message=FALSE, warning=FALSE, include=FALSE}
assembly.quality.plot1<- ggplot(dd) + 
                        geom_point(aes(x=nscf,y=N50mb,size=GS,fill=tech,col=RNAseq,species=DisplayName),stroke=.9,pch=21,alpha=.5) +
                        theme_classic() + 
                        scale_x_log10(labels=c("10","100","1k","10k","20k"),breaks=c(10,100,1000,10000,200000)) + 
                        scale_color_manual(values=c(rgb(1,1,1,0),"red"))

assembly.quality.plot2<-assembly.quality.plot1+ 
                        xlab("# scaffolds") + 
                        ylab("scaffold N50 (Mb)")+
                        theme(legend.position = c(.8, .65), 
                              legend.spacing = unit(.1, 'cm'),
                              legend.key.height = unit(0.3,"cm"),
                              legend.text = element_text(margin = margin(t = 2)),
                              legend.box.background = element_rect(colour = "black",fill=NA),
                              legend.title = element_text(size=8),
                              legend.background =  element_rect(colour = NA, fill = NA))
```

```{r}
ggsave("../../misc/plots/GAGA.assemblyQuality.pdf",assembly.quality.plot2,width=6,height=6)
```

```{R message=FALSE, warning=FALSE, include=FALSE}
p1.raw<-ggplotly(assembly.quality.plot2,tooltip = c("N50mb","DisplayName")) %>%layout(legend = list(orientation = "h", x = 0, y =-0.1))
p1<-ggplotly(assembly.quality.plot2,tooltip = c("N50mb","DisplayName")) %>%layout(legend = list(orientation = "h", x = 0, y =-0.1))

# change legend info for plotly plot
for (i in 1:length(p1$x$data)){
  p1$x$data[[i]]$legendgroup<-gsub(".*","",p1$x$data[[i]]$legendgroup)
  p1$x$data[[i]]$name<-gsub("\\(|\\)","",p1$x$data[[i]]$name)
}

saveWidget(as_widget(p1), "../../misc/plots/GAGA.assemblyQuality.html")

```

```{R message=FALSE, warning=FALSE, include=FALSE}
#prepare boxplots and calculate best/worst metrics
GAGA.quality.boxplots<-ggplot(dd,aes(y=N50mb,x=tech,fill=tech))+geom_boxplot(outlier.colour = NA)+geom_jitter(alpha=.2,width = 0.2)+scale_y_log10()+theme_classic()+theme(legend.position = c(.8, .2),axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+xlab("")+ylab("scaffold N50 (Mb)")
```
```{r}
ggsave("../../misc/plots/GAGA.assemblyQuality.boxplots.pdf",GAGA.quality.boxplots,width=6,height=6)
```

```{r}
dd$Subfamily<-facetData$Subfamily.y
dd$DisplayNameU<-make_unique(dd$DisplayName,sep="")
dd$DisplayNameU<- factor(dd$DisplayNameU, levels = dd$DisplayNameU[rev(order(dd$Subfamily,dd$GS))])
```


```{R message=FALSE, warning=FALSE, include=FALSE}
#compute genome size plot
dd.tmp<-dd
dd.tmp$Subfamily[dd.tmp$Subfamily=="Pseudomyrmecinae"]<-"Pseudomyrm."
GAGA.GenomeSize<-ggplot(subset(dd.tmp,!is.na(GS)),aes(x=DisplayNameU,y=GS))

GAGA.GenomeSize2<-GAGA.GenomeSize+geom_col(aes(fill=Subfamily),col=1,size=.2) + 
  coord_flip() + 
  scale_fill_viridis_d("Subfamily") + 
  ylab("Genome Size (Mb)")+
  xlab("Species")+
  theme_classic()+
  theme(legend.position = "bottom",axis.text.y=element_text(size=6.5))+
  geom_point(aes(col=B.hym.C),size=1.5)+
  guides(col = guide_legend(ncol = 1, title.position = "top",title = "Hym. BUSCO %"),
        fill = guide_legend(ncol = 4, title.position = "top"))+
  theme(axis.text.y = element_text(face = "italic"))

```

```{R message=FALSE, warning=FALSE, include=FALSE}
ggsave("../../misc/plots/GAGA.genomesize.pdf",GAGA.GenomeSize2,width=8,height=15)
```
